This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(tidyverse)
tidyverse_packages() # list the names of all packages that belong to the tidyverse
[1] "broom" "dplyr" "forcats" "ggplot2" "haven" "httr" "hms" "jsonlite"
[9] "lubridate" "magrittr" "modelr" "purrr" "readr" "readxl" "stringr" "tibble"
[17] "rvest" "tidyr" "xml2" "tidyverse"
# List vignettes from all *attached* packages
vignette(all = FALSE)
# List vignettes from all *installed* packages (can take a long time!)
vignette(all = TRUE)
# List the vignettes that ship with the "ggplot2" package
vignette(package = "ggplot2")
# Open the vignette named "ggplot2-specs"
vignette("ggplot2-specs")
now look for more information on ggplot
demo(graphics) # A demonstration of some of R's graphics capabilities; run it in the console
demo(graphics)
---- ~~~~~~~~
> # Copyright (C) 1997-2009 The R Core Team
>
> require(datasets)
> require(grDevices); require(graphics)
> ## Here is some code which illustrates some of the differences between
> ## R and S graphics capabilities. Note that colors are generally specified
> ## by a character string name (taken from the X11 rgb.txt file) and that line
> ## textures are given similarly. The parameter "bg" sets the background
> ## parameter for the plot and there is also an "fg" parameter which sets
> ## the foreground color.
>
>
> x <- stats::rnorm(50)
> opar <- par(bg = "white")
> plot(x, ann = FALSE, type = "n")
> abline(h = 0, col = gray(.90))
> lines(x, col = "green4", lty = "dotted")
> points(x, bg = "limegreen", pch = 21)
> title(main = "Simple Use of Color In a Plot",
+ xlab = "Just a Whisper of a Label",
+ col.main = "blue", col.lab = gray(.8),
+ cex.main = 1.2, cex.lab = 1.0, font.main = 4, font.lab = 3)
> ## A little color wheel. This code just plots equally spaced hues in
> ## a pie chart. If you have a cheap SVGA monitor (like me) you will
> ## probably find that numerically equispaced does not mean visually
> ## equispaced. On my display at home, these colors tend to cluster at
> ## the RGB primaries. On the other hand on the SGI Indy at work the
> ## effect is near perfect.
>
> par(bg = "gray")
> pie(rep(1,24), col = rainbow(24), radius = 0.9)
> title(main = "A Sample Color Wheel", cex.main = 1.4, font.main = 3)
> title(xlab = "(Use this as a test of monitor linearity)",
+ cex.lab = 0.8, font.lab = 3)
> ## We have already confessed to having these. This is just showing off X11
> ## color names (and the example (from the postscript manual) is pretty "cute".
>
> pie.sales <- c(0.12, 0.3, 0.26, 0.16, 0.04, 0.12)
> names(pie.sales) <- c("Blueberry", "Cherry",
+ "Apple", "Boston Cream", "Other", "Vanilla Cream")
> pie(pie.sales,
+ col = c("purple","violetred1","green3","cornsilk","cyan","white"))
> title(main = "January Pie Sales", cex.main = 1.8, font.main = 1)
> title(xlab = "(Don't try this at home kids)", cex.lab = 0.8, font.lab = 3)
> ## Boxplots: I couldn't resist the capability for filling the "box".
> ## The use of color seems like a useful addition, it focuses attention
> ## on the central bulk of the data.
>
> par(bg="cornsilk")
> n <- 10
> g <- gl(n, 100, n*100)
> x <- rnorm(n*100) + sqrt(as.numeric(g))
> boxplot(split(x,g), col="lavender", notch=TRUE)
> title(main="Notched Boxplots", xlab="Group", font.main=4, font.lab=1)
> ## An example showing how to fill between curves.
>
> par(bg="white")
> n <- 100
> x <- c(0,cumsum(rnorm(n)))
> y <- c(0,cumsum(rnorm(n)))
> xx <- c(0:n, n:0)
> yy <- c(x, rev(y))
> plot(xx, yy, type="n", xlab="Time", ylab="Distance")
> polygon(xx, yy, col="gray")
> title("Distance Between Brownian Motions")
> ## Colored plot margins, axis labels and titles. You do need to be
> ## careful with these kinds of effects. It's easy to go completely
> ## over the top and you can end up with your lunch all over the keyboard.
> ## On the other hand, my market research clients love it.
>
> x <- c(0.00, 0.40, 0.86, 0.85, 0.69, 0.48, 0.54, 1.09, 1.11, 1.73, 2.05, 2.02)
> par(bg="lightgray")
> plot(x, type="n", axes=FALSE, ann=FALSE)
> usr <- par("usr")
> rect(usr[1], usr[3], usr[2], usr[4], col="cornsilk", border="black")
> lines(x, col="blue")
> points(x, pch=21, bg="lightcyan", cex=1.25)
> axis(2, col.axis="blue", las=1)
> axis(1, at=1:12, lab=month.abb, col.axis="blue")
> box()
> title(main= "The Level of Interest in R", font.main=4, col.main="red")
> title(xlab= "1996", col.lab="red")
> ## A filled histogram, showing how to change the font used for the
> ## main title without changing the other annotation.
>
> par(bg="cornsilk")
> x <- rnorm(1000)
> hist(x, xlim=range(-4, 4, x), col="lavender", main="")
> title(main="1000 Normal Random Variates", font.main=3)
> ## A scatterplot matrix
> ## The good old Iris data (yet again)
>
> pairs(iris[1:4], main="Edgar Anderson's Iris Data", font.main=4, pch=19)
> pairs(iris[1:4], main="Edgar Anderson's Iris Data", pch=21,
+ bg = c("red", "green3", "blue")[unclass(iris$Species)])
> ## Contour plotting
> ## This produces a topographic map of one of Auckland's many volcanic "peaks".
>
> x <- 10*1:nrow(volcano)
> y <- 10*1:ncol(volcano)
> lev <- pretty(range(volcano), 10)
> par(bg = "lightcyan")
> pin <- par("pin")
> xdelta <- diff(range(x))
> ydelta <- diff(range(y))
> xscale <- pin[1]/xdelta
> yscale <- pin[2]/ydelta
> scale <- min(xscale, yscale)
> xadd <- 0.5*(pin[1]/scale - xdelta)
> yadd <- 0.5*(pin[2]/scale - ydelta)
> plot(numeric(0), numeric(0),
+ xlim = range(x)+c(-1,1)*xadd, ylim = range(y)+c(-1,1)*yadd,
+ type = "n", ann = FALSE)
> usr <- par("usr")
> rect(usr[1], usr[3], usr[2], usr[4], col="green3")
> contour(x, y, volcano, levels = lev, col="yellow", lty="solid", add=TRUE)
> box()
> title("A Topographic Map of Maunga Whau", font= 4)
> title(xlab = "Meters North", ylab = "Meters West", font= 3)
> mtext("10 Meter Contour Spacing", side=3, line=0.35, outer=FALSE,
+ at = mean(par("usr")[1:2]), cex=0.7, font=3)
> ## Conditioning plots
>
> par(bg="cornsilk")
> coplot(lat ~ long | depth, data = quakes, pch = 21, bg = "green3")
> par(opar)
note that the pipe can be run in parts (short cut Ctrl+Shift+M, CMD+SHIFT+M )
“ggplot2” is part of the “tidyverse” and a widely used package for working with graphics. Note that inside a ggplot2 call “+” is used to combine commands, in contrast to “%>%”, which is the pipe operator for chaining commands outside ggplot2.
# Scatter plot of engine displacement (displ) against highway mileage (hwy).
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

# Same scatter plot with the vehicle class mapped to the point colour.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point()

# City mileage (cty) mapped to the point size.
ggplot(mpg, aes(x = displ, y = hwy, size = cty)) +
  geom_point()

# Vehicle class mapped to the point transparency.
ggplot(mpg, aes(x = displ, y = hwy, alpha = class)) +
  geom_point()
Note: the default shape palette offers only 6 different shapes; therefore “suv” gets no shape and is not displayed.
# Vehicle class mapped to the point shape.
ggplot(mpg, aes(x = displ, y = hwy, shape = class)) +
  geom_point()

# A constant colour is set outside aes(), so it applies to every point.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(colour = "blue")
If there is a variable value which separates data it can be used to create multiple plots rather than multiple lines in one plot.
facet_wrap wraps a 1d sequence of panels into 2d
# One panel per vehicle class, wrapped into two rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(vars(class), nrow = 2)
facet_grid forms a matrix of panels defined by row and column facetting variables.
# Panel matrix: drive train (drv) in rows, number of cylinders (cyl) in columns.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(rows = vars(drv), cols = vars(cyl))
Analyse available data set in ggplot2! The data sets are listed and explained @ http://docs.ggplot2.org
Use - size - color - alpha - shape
to emphasise your message
# Which states occur in the midwest data set?
distinct(midwest, state)

# Area against total population on log-log axes, one panel per state,
# coloured by the percchildbelowpovert column.
midwest %>%
  ggplot(aes(x = area, y = poptotal, colour = percchildbelowpovert)) +
  geom_point() +
  facet_wrap(vars(state), nrow = 3) +
  scale_y_log10() +
  scale_x_log10()
different ways to present the same data
# The raw observations as points ...
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

# ... and the same data summarised by a smoothed line.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth()
draw a different line, with a different linetype, for each unique value of the variable that you map to linetype
# One smooth per drive train, distinguished by line type and colour.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(linetype = drv, colour = drv))

# One smooth per drive train, grouped only (no visual distinction, no legend).
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(group = drv))

# Points and smooth layered in one plot.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()

# Mappings given to ggplot() are shared by all layers; the colour mapping
# is local to the point layer.
mpg %>%
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth()

# A layer can use its own data: the smooth is fitted to subcompact cars only,
# and se = FALSE hides the confidence band.
mpg %>%
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth(data = mpg %>% filter(class == "subcompact"), se = FALSE)

# Bar chart: number of diamonds per cut.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()
CHEAT SHEETS are at your fingertips under HELP menu of RStudio IDE or https://www.rstudio.com/resources/cheatsheets/
stackoverflow is a vivid community http://stackoverflow.com
RDocumentation searches CRAN, BioConductor and Github packages https://www.rdocumentation.org
# Box plot of the price for each cut, outlined in a cut-specific colour.
ggplot(diamonds, aes(x = cut, y = price, colour = cut)) +
  geom_boxplot()
gives good impression of distribution
# Violin plot of the price for each cut, outlined in a cut-specific colour.
ggplot(diamonds, aes(x = cut, y = price, colour = cut)) +
  geom_violin()
A histogram is a graphical representation of the distribution of numerical data.
https://de.wikipedia.org/wiki/Histogramm
# Histogram of carat with the default binning
diamonds %>%
  ggplot(aes(x = carat)) +
  geom_histogram()

# Fix the width of each bin
diamonds %>%
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

# Fix the number of bins instead
diamonds %>%
  ggplot(aes(x = carat)) +
  geom_histogram(bins = 200)

# Rather than stacking histograms (first plot), it's easier to compare
# frequency polygons (second plot)
diamonds %>%
  ggplot(aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 500)

diamonds %>%
  ggplot(aes(x = price, colour = cut)) +
  geom_freqpoly(binwidth = 500)
Work with densities, which means that the area under each curve is one.
# To make it easier to compare distributions with very different counts,
# put density on the y axis instead of the default count.
# after_stat(density) refers to the variable computed by the stat; it
# replaces the deprecated `..density..` notation.
ggplot(diamonds, aes(x = price, y = after_stat(density), colour = cut)) +
  geom_freqpoly(binwidth = 500)
The empirical distribution function estimates the cumulative distribution function underlying the points in the sample; it converges to that distribution with probability 1 as the sample size grows.
https://de.wikipedia.org/wiki/Empirische_Verteilungsfunktion
# 10000 draws from a standard normal distribution
df <- data.frame(x = rnorm(10000))

# Histogram of the sample
df %>%
  ggplot(aes(x = x)) +
  geom_histogram()

# Empirical cumulative distribution function, drawn as a step function
df %>%
  ggplot(aes(x = x)) +
  stat_ecdf(geom = "step")

# Build the ECDF plot without drawing it and pull out the data computed for
# its first (and only) layer ...
p <- df %>%
  ggplot(aes(x = x)) +
  stat_ecdf()
pg <- ggplot_build(p)[["data"]][[1]]

# ... then plot 1 - ECDF on a logarithmic y axis
pg %>%
  ggplot(aes(x = x, y = 1 - y)) +
  geom_step() +
  scale_y_log10()
In statistics, correlation describes the relationship between two variables.
library(corrplot)
# Correlation matrix of the iris columns other than Species
cor_iris <- iris %>%
  select(-Species) %>%
  cor()

# Visualise the correlation matrix
corrplot.mixed(cor_iris)
further details on the corrplot package can be found in the vignette
vignette("corrplot-intro") # open the introductory vignette of the corrplot package
In statistics, the maximal information coefficient (MIC) is a measure of the strength of the linear or non-linear association between two variables X and Y.
library(minerva)
# Print the maximal information coefficient (MIC) and the correlation of two
# vectors on one line.
#
# x, y: the two vectors; both are passed to mine() and to cor().
#
# Called for its printed output.
compare_mic_r <- function(x, y) {
  mic_value <- mine(x, y)$MIC
  correlation <- cor(x, y)
  cat("MIC:", mic_value, ";", "correlation: ", correlation, "\n")
}
# 1000 uniform random numbers between 0 and 1
x <- runif(n = 1000)

# Parabola in x: plot it ordered by x, then compare MIC and correlation
y2 <- 4 * (x - 0.5)^2
plot(sort(x), y2[order(x)], type = "l")
compare_mic_r(x, y2)
MIC: 1 ; correlation: -0.04411526
# Sine of a quadratic function of x: plot it ordered by x, then compare MIC
# and correlation
y3 <- sin(6 * pi * x * (1 + x))
plot(sort(x), y3[order(x)], type = "l")
compare_mic_r(x, y3)
MIC: 1 ; correlation: -0.111569
# Points on the unit circle, parameterised by the angle t
t <- seq(0, 2 * pi, length.out = 1000)
x4 <- cos(t)
y4 <- sin(t)
plot(x4, y4, type = "l", asp = 1)
compare_mic_r(x4, y4)
MIC: 0.6829015 ; correlation: 5.798018e-18
Explore a data set even further use
use
need help?
library(nycflights13) # provides the flights tibble used in the examples below
flights # print the flights tibble
Filter all rows where month == 12 and day == 24; multiple filter conditions are separated by “,” and are treated as logical “AND”.
note, if you wrap the expression in () then the result will be displayed even when the result is assigned to a variable.
The
# Flights on 24 December; the outer parentheses print the assigned value.
(xmas_flights <- flights %>% filter(month == 12, day == 24))
# Open the help page on logical operators. The page lives in package "base";
# the former "?basic::Logic" failed because no package called "basic" exists.
?base::Logic
the following expressions give the same result
# Flights where neither the arrival nor the departure delay exceeds 120,
# written once as a negated "or" ...
flights %>% filter(!(arr_delay > 120 | dep_delay > 120))
# ... and once as two conditions that must both hold
flights %>% filter(arr_delay <= 120, dep_delay <= 120)

# Sort the rows by year, then month, then day
flights %>% arrange(year, month, day)
also an easy way to bring columns in a specific order
select all but a range of columns
# Keep every column except the range year ... day
flights %>% select(-(year:day))
more can be found in the cheatsheet
note the %>% operator
# Reduce flights to the columns needed, add derived columns with mutate(),
# then drop month, day and the helper column speed again.
flights %>%
  select(year:day, ends_with("delay"), distance, air_time) %>%
  mutate(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  ) %>%
  select(-c(month, day, speed))
if you only want to keep the new columns use “transmute()”
# Same derived columns as before, but transmute() keeps only the new columns.
flights %>%
  select(year:day, ends_with("delay"), distance, air_time) %>%
  transmute(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
the mean of all departure delays
# Mean departure delay over all flights, returned as a one-row tibble.
flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE))

# na.rm: a logical value indicating whether NA values should be stripped
# before the computation proceeds.
# The same summary, converted to a plain number.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  as.numeric()
[1] 12.63907
# Group the flights by calendar day and compute the mean departure delay
# for each day.
by_day <- flights %>% group_by(year, month, day)
by_day %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
find pattern of delays during the year
# NOTE: by_day is re-assigned here and is now grouped by year and month.
by_day <- group_by(flights, year, month)

# Mean departure delay per month as a bar chart
by_day %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = delay, group = month)) +
  geom_col()
# Flights with a recorded arrival delay
not_cancelled <- filter(flights, !is.na(arr_delay))

# Distribution of the mean arrival delay per plane (tail number)
not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay)) %>%
  ggplot(aes(x = delay)) +
  geom_freqpoly(binwidth = 10)
There seem to be a few planes with a very high mean delay. Let's look closer into the issue.
# Mean arrival delay (delay) and number of flights (n) per tail number
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE), n = n())

# Mean delay against the number of flights; alpha makes overplotting visible
delays %>%
  ggplot(aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)
The high delays belong to tail numbers with only a small number of flights. Let's keep only the tail numbers for which more than 25 flights are recorded.
# Same scatter plot, restricted to tail numbers with more than 25 flights
ggplot(filter(delays, n > 25), aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)
what if we want to select the points under consideration not via a limit but from a plot? Use Shiny Gadgets
library(shiny)
library(miniUI)
# Shiny gadget that shows a scatter plot and lets the user select points by
# dragging a rectangle (a "brush") over them.
#
# Arguments:
#   data  data frame to plot
#   xvar  name of the column for the x axis, as a character string
#   yvar  name of the column for the y axis, as a character string
#
# Returns the value handed to stopApp() once "Done" is pressed: the result of
# brushedPoints() on `data`. allRows = TRUE asks brushedPoints() to keep all
# rows; the code that calls ggbrush() relies on a `selected_` column in the
# result.
ggbrush <- function(data, xvar, yvar) {
  ui <- miniPage(
    gadgetTitleBar("Drag to select points"),
    miniContentPanel(
      # The brush = "brush" argument means we can listen for
      # brush events on the plot using input$brush.
      plotOutput("plot", height = "100%", brush = "brush")
    )
  )

  server <- function(input, output, session) {
    # Render the plot
    output$plot <- renderPlot({
      # Plot the data with the x/y columns named by the caller. The .data
      # pronoun looks columns up by string name and replaces the deprecated
      # aes_string(); labs() keeps the plain column names as axis titles.
      ggplot(data, aes(x = .data[[xvar]], y = .data[[yvar]])) +
        geom_point() +
        labs(x = xvar, y = yvar)
    })

    # Handle the Done button being pressed.
    observeEvent(input$done, {
      # Return the brushed points. See ?shiny::brushedPoints.
      # xvar/yvar are passed explicitly so the selection does not depend on
      # shiny inferring the column names from the plot mapping.
      stopApp(
        brushedPoints(
          data,
          input$brush,
          xvar = xvar,
          yvar = yvar,
          allRows = TRUE
        )
      )
    })
  }

  runGadget(ui, server)
}
# NOTE(review): the former example call "pick_points(mtcars, ~wt, ~mpg)" refers
# to a function that is not defined in this notebook; ggbrush() is used here.
# Opens the gadget: drag over the points of interest, then press "Done".
brushed_points <- ggbrush(delays, "n", "delay")

# All tail numbers, coloured by whether they were selected
ggplot(brushed_points, aes(x = n, y = delay, colour = selected_)) +
  geom_point(alpha = 1 / 10)

# Only the selected tail numbers
brushed_points %>%
  filter(selected_) %>%
  ggplot(aes(x = n, y = delay, colour = selected_)) +
  geom_point(alpha = 1 / 3)
now wrangle your data to analyse it use
In addition to flights, the nycflights13 package provides four more tibbles (data frames):
# Print each of the additional nycflights13 tibbles
airlines
airports
planes
weather
first we need to join flights with planes
# Attach the plane details to every flight, matched on the tail number
flight_planes <- flights %>% left_join(planes, by = "tailnum")

# Delay per flight and number of flights for each manufacturer, worst first.
# The delay sum skips missing arr_delay values, while n() counts every flight.
flight_planes %>%
  group_by(manufacturer) %>%
  summarise(
    delay_per_flight = sum(arr_delay, na.rm = TRUE) / n(),
    number_of_flights = n()
  ) %>%
  arrange(desc(delay_per_flight))
next we join flights with airlines
# Attach the airline name to every flight. The join key is stated explicitly
# instead of relying on the implicit natural join over all shared column
# names, which also avoids the "Joining with ..." message.
flight_airlines <- left_join(flights, airlines, by = "carrier")

# Delay per flight and number of flights for each airline, worst first.
# The delay sum skips missing arr_delay values, while n() counts every flight.
flight_airlines %>%
  group_by(name) %>%
  summarise(
    delay_per_flight = sum(arr_delay, na.rm = TRUE) / n(),
    number_of_flights = n()
  ) %>%
  arrange(desc(delay_per_flight))
for some operations the tidy wide format is not suitable as input to an operation, then a “long” version of the data.frame can be generated using the “melt” command.
A further example will be shown in EuropeLeagueTransfers.Rmd and further information on the topic can be found at http://seananderson.ca/2013/10/19/reshape.html
Cast functions cast (German: gießen) a molten data frame into an array or data frame. Casting is the reverse operation of melt and will be used in EuropeLeagueTransfers.Rmd
grep returns the indices of the elements that match a regular expression; grepl returns a logical vector instead
letters # built-in vector of the lower-case letters
grep("[a-c]", letters) # indices of the elements that match a, b or c
grep("[a-z]", letters) # indices of the elements that match any letter from a to z
grepl("[a-c]", letters) # the same match, one TRUE/FALSE per element
grepl("[a-z]", letters) # the same match, one TRUE/FALSE per element
EuropeLeagueTransfers.Rmd